About this notebook This notebook is part of analysis of evaluate the impact of different reference bundle on scRNA-Seq quantification. # Annotation First we need to use package rtracklayer to load several gene annotation gtf files. * Ensembl * refseq * gencodeV27
library('ggplot2')
library('rtracklayer')
system.time(gtf_gencode_comp <- readGFF("~/Documents/HCA/reference/refs/gencode.v27.chr_patch_hapl_scaff.annotation.gtf", version=2L, tags = c("gene_name","gene_id", "transcript_id","gene_type")))
user system elapsed
16.114 0.993 17.335
system.time(gtf_gencode_basic <- readGFF("~/Documents/HCA/reference/refs/gencode.v27.chr_patch_hapl_scaff.basic.annotation.gtf", version=2L, tags = c("gene_name","gene_id", "transcript_id","gene_type")))
user system elapsed
10.027 0.576 10.658
system.time(gtf_ensembl <- readGFF("~/Documents/HCA/reference/refs/Homo_sapiens.GRCh38.90.gtf", version=2L,tags = c("gene_name","gene_id", "transcript_id","gene_biotype")))
user system elapsed
14.738 0.773 15.573
system.time(gtf_refseq <- readGFF("~/Documents/HCA/reference/refs/ncbi-genomes-2017-10-05/GCF_000001405.37_GRCh38.p11_genomic.gff.gz", tags = c("ID", "Name","gbkey","gene","gene_biotype","Parent")))
user system elapsed
15.570 0.585 16.162
some header of gtf files
tags<-c('GeneID','Chr','Start','End','Strand','Length','Counts')
head(gtf_ensembl)
a set of gene quantification results is shown as below. First the gene counts from ensembl bundle. The SRR1294900_25_GRCh38_Ensembl.gene.unq.counts.txt is generated from featureCounts with unique option
ensembl.counts<-read.delim('/Users/jishuxu/Documents/HCA/reference/counts/SRR1294900_25_GRCh38_Ensembl.gene.unq.counts.txt',sep='\t',header=T,skip=1)
colnames(ensembl.counts)<-tags
mlist<-match(ensembl.counts$GeneID,gtf_ensembl$gene_id)
geneName<-gtf_ensembl$gene_name[mlist]
btype<-gtf_ensembl$gene_biotype[mlist]
c1<-data.frame("geneName"=geneName,'Ensembl.Counts'=log(ensembl.counts$Counts+1,base=2),'Ensembl.type'=gtf_ensembl$gene_biotype[mlist])
head(c1)
If we define detected genes as genes with coverage >0 and good coverage genes as genes with coverage >5. Then we can plot the top 10 biotype categories.
gene.detects<-list()
t1<-sort(table(c1[c1$Ensembl.Counts>0,3]),decreasing=T)
t2<-sort(table(c1[c1$Ensembl.Counts>5,3]),decreasing=T)
gene.detects[['ensembl']]<-cbind('detected.genes'=t1[1:10],'good.genes'=t2[1:10])
RefSeq bundle resultsThen the gene counts from RefSeq.
refseq.counts<-read.delim('/Users/jishuxu/Documents/HCA/reference/counts/SRR1294900_25_GRCh38_RefSeq.gene.unq.counts.txt',sep='\t',header=T,skip=1)
colnames(refseq.counts)<-tags
mlist<-match(refseq.counts$GeneID,gtf_refseq$ID)
geneName<-gtf_refseq$Name[mlist]
c2<-data.frame("geneName"=geneName,"RefSeq.Counts"=log(refseq.counts$Counts+1,base=2),'RefSeq.type'=gtf_refseq$gene_biotype[mlist])
head(c2)
summary detected genes and good genes based on biotype.
t1<-sort(table(c2[c2$RefSeq.Counts>0,3]),decreasing=T)
t2<-sort(table(c2[c2$RefSeq.Counts>5,3]),decreasing=T)
gene.detects[['refseq']]<-cbind('detected.genes'=t1[1:10],'good.genes'=t2[1:10])
gencode bundle resultsGencodeV27 Comprehansive and GencodeV27 Basic. Repeat the similar analysis.gencode.counts.comp<-read.delim('/Users/jishuxu/Documents/HCA/reference/counts/SRR1294900_25_GRCh38_GencodeV27.gene.unq.counts.txt',sep='\t',header=T,skip=1)
colnames(gencode.counts.comp)<-tags
mlist<-match(gencode.counts.comp$GeneID,gtf_gencode_comp$gene_id)
geneName<-gtf_gencode_comp$gene_name[mlist]
c3<-data.frame("geneName"=geneName,"GencodeComp.Counts"=log(gencode.counts.comp$Counts+1,base=2),'Gencode.type'=gtf_gencode_comp$gene_type[mlist])
dim(c3)
[1] 63925 3
gencode.counts.basic<-read.delim('/Users/jishuxu/Documents/HCA/reference/counts/SRR1294900_25_GRCh38_GencodeV27_basic.gene.unq.counts.txt',sep='\t',header=T,skip=1)
colnames(gencode.counts.basic)<-tags
mlist<-match(gencode.counts.basic$GeneID,gtf_gencode_basic$gene_id)
geneName<-gtf_gencode_basic$gene_name[mlist]
c4<-data.frame("geneName"=geneName,"GencodeBasic.Counts"=log(gencode.counts.basic$Counts+1,base=2),'Gencode.type'=gtf_gencode_basic$gene_type[mlist])
dim(c4)
[1] 63925 3
Comprehansive GencodeV27t1<-sort(table(c3[c3$GencodeComp.Counts>0,3]),decreasing=T)
t2<-sort(table(c3[c3$GencodeComp.Counts >5,3]),decreasing=T)
gene.detects[['GencodeComp']]<-cbind('detected.genes'=t1[1:10],'good.genes'=t2[1:10])
Basic GencodeV27t1<-sort(table(c4[c4$GencodeBasic.Counts>0,3]),decreasing=T)
t2<-sort(table(c4[c4$GencodeBasic.Counts >5,3]),decreasing=T)
gene.detects[["GencodeBasic"]]<-cbind('detected.genes'=t1[1:10],'good.genes'=t2[1:10])
Now we can summary on detected genes by biotype cross alll 4 reference bundle.
##top 3 biotype
library(reshape2)
bios<-c('protein_coding','processed_pseudogene','pseudogene','lincRNA','lncRNA')
g.ensembl<-subset(gene.detects$ensembl,rownames(gene.detects$ensembl) %in% bios)
g.refseq<-subset(gene.detects$refseq,rownames(gene.detects$refseq) %in% bios)
g.gencode<-subset(gene.detects$GencodeComp,rownames(gene.detects$GencodeComp) %in% bios)
g.gencodeBasic<-subset(gene.detects$GencodeBasic,rownames(gene.detects$GencodeBasic) %in% bios)
detects<-data.frame('biotype'=rep(c('protein_coding','pseudogene','lncRNA'),4),rbind(g.ensembl,g.refseq,g.gencode,g.gencodeBasic),'bundle'=rep(c('Ensembl','RefSeq','Gencode','GencodeV27'),c(3,3,3,3)))
some row.names duplicated: 4,7,8,9,10,11,12 --> row.names NOT used
m.detects<-melt(detects)
Using biotype, bundle as id variables
##detected >0X
ggplot(data=m.detects,aes(x=biotype,y=value,fill=bundle))+geom_bar(stat="identity",position=position_dodge())+facet_grid(variable~.,scales="free_y")+geom_text(aes(biotype, value, label = value),position = position_dodge(width = 1))
Dropout, overdispersion eventsEnsembl vs RefSeqWe compare the gene quantification from Ensembl and RefSeq reference bundles. We are looking for 3 type of events, drop-out if only detectable from one reference bundle but not another, predictable if one can be predicted or estimated by another or the observation in one result can fall into prediction interval based on regression model,overdispersion if neither dropout nor predictable, overdispersion generally represent some genes have very high expression in one results but very low expression in another. - First merge two quantification by geneName or geneID.
ensembl.refseq<-merge(c1,c2,by=c('geneName'),all=F,all.x=F,all.y=F)
dim(ensembl.refseq)
[1] 32647 5
Overlapping genes ID between Ensembl and RefSeq is 32647. - Let’s examine the association between Ensembl and RefSeq quantification. Before the regression analysis, we need to remove drop-out genes from both ensembl and refseq.
subdata<-subset(ensembl.refseq,Ensembl.Counts>0 & RefSeq.Counts >0)
fit<-lm(RefSeq.Counts~Ensembl.Counts, data=subdata)
summary(fit)
Call:
lm(formula = RefSeq.Counts ~ Ensembl.Counts, data = subdata)
Residuals:
Min 1Q Median 3Q Max
-9.6304 -0.0886 0.0881 0.1788 7.6481
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.16485 0.02844 5.796 7.22e-09 ***
Ensembl.Counts 0.96185 0.00410 234.607 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8881 on 4967 degrees of freedom
Multiple R-squared: 0.9172, Adjusted R-squared: 0.9172
F-statistic: 5.504e+04 on 1 and 4967 DF, p-value: < 2.2e-16
The linear model is proper.
pred<-predict(fit,data=ensembl.refseq,level=0)
conf_interval_3 <- predict(fit, newdata=data.frame(Ensembl.Counts=100), interval="prediction",level = 0.95)
conf_interval_3
fit lwr upr
1 96.34994 94.45249 98.24739
Then for regression line
conf_interval <- predict(fit, newdata=ensembl.refseq, interval="prediction",level = 0.95)
If data point can not fall into this 95% prediction intervals, it should be counted as overdispersion - let’s plot results with scatterplot
##this function just label model in plot
equation = function(x) {
lm_coef <- list(a = round(coef(x)[1], digits = 2),
b = round(coef(x)[2], digits = 2),
r2 = round(summary(x)$r.squared, digits = 2));
lm_eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(R)^2~"="~r2,lm_coef)
as.character(as.expression(lm_eq));
}
##label with color
type<-rep('normal',nrow(ensembl.refseq))
type[ensembl.refseq$Ensembl.Counts==0 & ensembl.refseq$RefSeq.Counts>0]<-'dropout.ensembl'
type[ensembl.refseq$Ensembl.Counts>0 & ensembl.refseq$RefSeq.Counts==0]<-'dropout.refseq'
type[(ensembl.refseq$RefSeq.Counts > conf_interval[,3]|ensembl.refseq$RefSeq.Counts<conf_interval[,2])&(ensembl.refseq$Ensembl.Counts>0 & ensembl.refseq$RefSeq.Counts>0)]<-'overdispersion'
ensembl.refseq$type<-type
## label prefiction model in plot
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, list(a = format(coef(fit)[1], digits = 2), b = format(coef(fit)[2], digits = 2), r2 = format(summary(fit)$r.squared, digits = 3)))
## scatter plot
p1 <- ggplot(ensembl.refseq,aes(x=Ensembl.Counts,y=RefSeq.Counts),color="grey")
p1<-p1+geom_ribbon(aes(ymin=conf_interval[,2],ymax=conf_interval[,3]),alpha=0.2,fill="red")
p1<-p1+geom_point(data=ensembl.refseq,aes(x=Ensembl.Counts,y=RefSeq.Counts,color=type))
p1<-p1+annotate("text", x = 4.1, y = 12, label = equation(fit), parse = TRUE)
p1<-p1+xlab("log2(Ensembl.Counts+1)")+ylab("log2(RefSeq+1)")
p1
Ensembl compared to RefSeqdropout.ensembl<-subset(ensembl.refseq,Ensembl.Counts==0 & RefSeq.Counts>0)
dim(dropout.ensembl)
[1] 325 6
## barplot
x<-data.frame(sort(table(dropout.ensembl$Ensembl.type),decreasing=T)[1:10])
colnames(x)<-c('Biotype','Counts')
ggplot(data.frame(x),aes(x=Biotype,y=Counts))+geom_bar(aes(fill = Biotype), position = "dodge", stat="identity")+xlab('Biotype')+ylab("# of dropout genes")+geom_text(aes(label=Counts), vjust=1.6, color="black", size=3.5) +theme(axis.text.x=element_blank())+ggtitle('dropout genes in Ensembl')
dropout genes in Ensembl have any biological functional meaning. We overlap dropout genes with MSigDB C2_curated geneset, C2_curated geneset contain most KEGG pathway. two way to calculate the geneset overlapping, Fisher-exact test and hyper-geometry test. We use both tests.library(GeneOverlap)
library(MSigDB)
## create function to do overlapping test
gsOverlap<-function(gs_cureated,gs){
##msigdb_c2<-MSigDB[["C2_CURATED"]]
pvals<-c()
for(i in names(gs_curated)){
go.obj <- newGeneOverlap(gs_curated[[i]], gs, genome.size=60000)
##fisher exact
go.obj <- testGeneOverlap(go.obj)
overlap<-length(getIntersection(go.obj))
PopSize<-60000
f.pval<-getPval(go.obj)
list1<-length(gs)
list2<-length(gs_curated[[i]])
##hyperGT
if(overlap>1){
h.pval<-phyper(overlap-1,list1,PopSize-list1,list2,lower.tail = FALSE, log.p = FALSE)
}else{
h.pval<-1
}
pvals<-rbind(pvals,c(i,overlap,list1,list2,f.pval,h.pval))
}
colnames(pvals)<-c('gsName','overlap','dropout.gs.length','msigdb.gs.length','fisher','hyper')
pvals<-data.frame(pvals)
pvals$fisher.adj<-p.adjust(pvals$fisher)
pvals$hyper.adj<-p.adjust(pvals$hyper)
return(pvals)
}
## do test : ensembl
gs_curated<-MSigDB[["C2_CURATED"]]
gs<-dropout.ensembl$geneName
res<-gsOverlap(gs_curated,gs)
head(res)
sum(res$fisher.adj<0.05)
[1] 0
sum(res$hyper.adj<0.05)
[1] 0
## hypherGtest
So there is no significant enrichment with any genesets in C2. And repeat this analysis with the dropout genes in RefSeq
dropout.refseq<-subset(ensembl.refseq,Ensembl.Counts>0 & RefSeq.Counts==0)
dim(dropout.refseq)
[1] 954 6
##barplot(sort(table(dropout.refseq$RefSeq.type),decreasing=T)[1:10],main="Dropout genes: RefSeq")
x<-data.frame(sort(table(dropout.refseq$RefSeq.type),decreasing=T)[1:10])
colnames(x)<-c('Biotype','Counts')
ggplot(data.frame(x),aes(x=Biotype,y=Counts))+geom_bar(aes(fill = Biotype), position = "dodge", stat="identity")+xlab('Biotype')+ylab("# of dropout genes")+geom_text(aes(label=Counts), vjust=1.6, color="black", size=3.5) +theme(axis.text.x=element_blank())+ggtitle('dropout genes in RefSeq')
RefSeq’s dropout genes have any functional meaning?
gs<-dropout.refseq$geneName
res<-gsOverlap(gs_curated,gs)
head(res)
sum(res$fisher.adj<0.05)
[1] 0
sum(res$hyper.adj<0.05)
[1] 0
GencodeV27Basic vs refSeqGencodeV27Basic with RefSeq.gencode.refseq<-merge(c2,c4,by=c('geneName'),all=F, all.x=F,all.y=F)
dim(gencode.refseq)
[1] 57943 5
subdata<-subset(gencode.refseq,GencodeBasic.Counts>0 & RefSeq.Counts >0)
fit<-lm(RefSeq.Counts~GencodeBasic.Counts, data=subdata)
summary(fit)
Call:
lm(formula = RefSeq.Counts ~ GencodeBasic.Counts, data = subdata)
Residuals:
Min 1Q Median 3Q Max
-9.4115 -0.1340 -0.0216 0.0356 9.3393
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.214134 0.021139 10.13 <2e-16 ***
GencodeBasic.Counts 0.973297 0.003087 315.27 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6724 on 4939 degrees of freedom
Multiple R-squared: 0.9527, Adjusted R-squared: 0.9527
F-statistic: 9.94e+04 on 1 and 4939 DF, p-value: < 2.2e-16
pred<-predict(fit,data=gencode.refseq)
conf_interval_3 <- predict(fit, newdata=data.frame(GencodeBasic.Counts=100), interval="prediction",level = 0.95)
conf_interval_3
fit lwr upr
1 97.54379 96.10816 98.97943
95% prediction intervals for regression line
conf_interval <- predict(fit, newdata=gencode.refseq, interval="prediction",level = 0.95)
##label with color
type<-rep('normal',nrow(gencode.refseq))
type[gencode.refseq$GencodeBasic.Counts==0 & gencode.refseq$RefSeq.Counts>0]<-'dropout.gencode'
type[gencode.refseq$GencodeBasic.Counts>0 & gencode.refseq$RefSeq.Counts==0]<-'dropout.refseq'
type[(gencode.refseq$RefSeq.Counts > conf_interval[,3] | gencode.refseq$RefSeq.Counts<conf_interval[,2])&(gencode.refseq$GencodeBasic.Counts>0 & gencode.refseq$RefSeq.Counts>0)]<-'overdispersion'
gencode.refseq$type<-type
table(gencode.refseq$type)
dropout.gencode dropout.refseq normal overdispersion
483 367 56918 175
## label prefiction model in plot
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, list(a = format(coef(fit)[1], digits = 2), b = format(coef(fit)[2], digits = 2), r2 = format(summary(fit)$r.squared, digits = 3)))
## scatter plot
p1 <- ggplot(gencode.refseq,aes(x=GencodeBasic.Counts,y=RefSeq.Counts),color="grey")
p1<-p1+geom_ribbon(aes(ymin=conf_interval[,2],ymax=conf_interval[,3]),alpha=0.2,fill="red")
p1<-p1+geom_point(data=gencode.refseq,aes(x=GencodeBasic.Counts,y=RefSeq.Counts,color=type))
p1<-p1+annotate("text", x = 4.1, y = 12, label = equation(fit), parse = TRUE)
p1<-p1+xlab("log2(GencodeBasic.Counts+1)")+ylab("log2(RefSeq.Counts+1)")
p1
GencodeV27 basic compared to RefSeqdropout.gencode<-subset(gencode.refseq,GencodeBasic.Counts==0 & RefSeq.Counts>0)
dim(dropout.gencode)
[1] 483 6
#barplot(sort(table(dropout.gencode$Gencode.type),decreasing=T)[1:10],main="Dropout genes: GencodeBasic")
x<-data.frame(sort(table(dropout.gencode$Gencode.type),decreasing=T)[1:10])
colnames(x)<-c('Biotype','Counts')
ggplot(data.frame(x),aes(x=Biotype,y=Counts))+geom_bar(aes(fill = Biotype), position = "dodge", stat="identity")+xlab('Biotype')+ylab("# of dropout genes")+geom_text(aes(label=Counts), vjust=1.6, color="black", size=3.5) +theme(axis.text.x=element_blank())+ggtitle('dropout gene in Gencode')
whether dropout genes have any functional meaning?
gs<-dropout.gencode$geneName
gs_curated<-MSigDB[["C2_CURATED"]]
res<-gsOverlap(gs_curated,gs)
head(res)
sum(res$fisher.adj<0.05)
[1] 0
sum(res$hyper.adj<0.05)
[1] 0
Then the drop-out genes in RefSeq compared to GencodeV27 Basic
dropout.refseq<-subset(gencode.refseq,GencodeBasic.Counts>0 & RefSeq.Counts==0)
dim(dropout.refseq)
[1] 367 6
##barplot(sort(table(dropout.refseq$RefSeq.type),decreasing=T)[1:10],main="Dropout genes: RefSeq")
x<-data.frame(sort(table(dropout.refseq$RefSeq.type),decreasing=T)[1:10])
colnames(x)<-c('Biotype','Counts')
ggplot(data.frame(x),aes(x=Biotype,y=Counts))+geom_bar(aes(fill = Biotype), position = "dodge", stat="identity")+xlab('Biotype')+ylab("# of dropout genes")+geom_text(aes(label=Counts), vjust=1.6, color="black", size=3.5) +theme(axis.text.x=element_blank())+ggtitle('dropout genes in refseq')
Whether dropout genes have functional meaning?
gs<-dropout.refseq$geneName
gs_curated<-MSigDB[["C2_CURATED"]]
res<-gsOverlap(gs_curated,gs)
head(res)
sum(res$fisher.adj<0.05)
[1] 0
sum(res$hyper.adj<0.05)
[1] 0
dectected genes between reference bundles.library('VennDiagram')
g1<-c1[c1$Ensembl.Counts>0,1]
g2<-c2[c2$RefSeq.Counts>0,1]
g3<-c3[c3$GencodeComp.Counts>0,1]
g4<-c4[c4$GencodeBasic.Counts>0,1]
venn.plot<-venn.diagram(x = list('Ensembl'=g1,'RefSeq'=g2,'Gencode'=g3,'GencodeBasic'=g4), imagetype='png',filename = "Venn.png",
col = "transparent", fill = c("cornflowerblue","green","yellow","darkorchid1"),
label.col = c("orange", "white", "darkorchid4", "white", "white",
"white", "white", "white", "darkblue", "white", "white", "white", "white",
"darkgreen", "white"),alpha = 0.50,cex = 1.5, fontfamily = "serif", fontface = "bold",
cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"), cat.cex = 1.5,
cat.pos = 0, cat.dist = 0.07, cat.fontfamily = "serif", rotation.degree = 270,
margin = 0.2)
- we evaluate the 4-way overlapping on
high coverage genes (>5X) between reference bundles.
g1<-c1[c1$Ensembl.Counts>5,1]
g2<-c2[c2$RefSeq.Counts>5,1]
g3<-c3[c3$GencodeComp.Counts>5,1]
g4<-c4[c4$GencodeBasic.Counts>5,1]
venn.plot<-venn.diagram(x = list('Ensembl'=g1,'RefSeq'=g2,'Gencode'=g3,'GencodeBasic'=g4), imagetype='png',filename = "Venn_good_genes.png",
col = "transparent", fill = c("cornflowerblue","green","yellow","darkorchid1"),
label.col = c("orange", "white", "darkorchid4", "white", "white",
"white", "white", "white", "darkblue", "white", "white", "white", "white",
"darkgreen", "white"),alpha = 0.50,cex = 1.5, fontfamily = "serif", fontface = "bold",
cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"), cat.cex = 1.5,
cat.pos = 0, cat.dist = 0.07, cat.fontfamily = "serif", rotation.degree = 270,
margin = 0.2)
high covered genes